import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
ins_data = pd.read_csv('https://raw.githubusercontent.com/FlipRoboTechnologies/ML_-Datasets/main/Insurance%20Claim%20Fraud%20Detection/Automobile_insurance_fraud.csv')
ins_data
| 328 | 48 | 521585 | 17-10-2014 | OH | 250/500 | 1000 | 1406.91 | 0 | 466132 | ... | 2 | YES.1 | 71610 | 6510 | 13020 | 52080 | Saab | 92x | 2004 | Y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 228 | 42 | 342868 | 27-06-2006 | IN | 250/500 | 2000 | 1197.22 | 5000000 | 468176 | ... | 0 | ? | 5070 | 780 | 780 | 3510 | Mercedes | E400 | 2007 | Y |
| 1 | 134 | 29 | 687698 | 06-09-2000 | OH | 100/300 | 2000 | 1413.14 | 5000000 | 430632 | ... | 3 | NO | 34650 | 7700 | 3850 | 23100 | Dodge | RAM | 2007 | N |
| 2 | 256 | 41 | 227811 | 25-05-1990 | IL | 250/500 | 2000 | 1415.74 | 6000000 | 608117 | ... | 2 | NO | 63400 | 6340 | 6340 | 50720 | Chevrolet | Tahoe | 2014 | Y |
| 3 | 228 | 44 | 367455 | 06-06-2014 | IL | 500/1000 | 1000 | 1583.91 | 6000000 | 610706 | ... | 1 | NO | 6500 | 1300 | 650 | 4550 | Accura | RSX | 2009 | N |
| 4 | 256 | 39 | 104594 | 12-10-2006 | OH | 250/500 | 1000 | 1351.10 | 0 | 478456 | ... | 2 | NO | 64100 | 6410 | 6410 | 51280 | Saab | 95 | 2003 | Y |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 994 | 3 | 38 | 941851 | 16-07-1991 | OH | 500/1000 | 1000 | 1310.80 | 0 | 431289 | ... | 1 | ? | 87200 | 17440 | 8720 | 61040 | Honda | Accord | 2006 | N |
| 995 | 285 | 41 | 186934 | 05-01-2014 | IL | 100/300 | 1000 | 1436.79 | 0 | 608177 | ... | 3 | ? | 108480 | 18080 | 18080 | 72320 | Volkswagen | Passat | 2015 | N |
| 996 | 130 | 34 | 918516 | 17-02-2003 | OH | 250/500 | 500 | 1383.49 | 3000000 | 442797 | ... | 3 | YES | 67500 | 7500 | 7500 | 52500 | Suburu | Impreza | 1996 | N |
| 997 | 458 | 62 | 533940 | 18-11-2011 | IL | 500/1000 | 2000 | 1356.92 | 5000000 | 441714 | ... | 1 | YES | 46980 | 5220 | 5220 | 36540 | Audi | A5 | 1998 | N |
| 998 | 456 | 60 | 556080 | 11-11-1996 | OH | 250/500 | 1000 | 766.19 | 0 | 612260 | ... | 3 | ? | 5060 | 460 | 920 | 3680 | Mercedes | E400 | 2007 | N |
999 rows × 39 columns
ins_data.shape
(999, 39)
There are 999 rows and 39 columns in our dataset.
ins_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 999 entries, 0 to 998 Data columns (total 39 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 328 999 non-null int64 1 48 999 non-null int64 2 521585 999 non-null int64 3 17-10-2014 999 non-null object 4 OH 999 non-null object 5 250/500 999 non-null object 6 1000 999 non-null int64 7 1406.91 999 non-null float64 8 0 999 non-null int64 9 466132 999 non-null int64 10 MALE 999 non-null object 11 MD 999 non-null object 12 craft-repair 999 non-null object 13 sleeping 999 non-null object 14 husband 999 non-null object 15 53300 999 non-null int64 16 0.1 999 non-null int64 17 25-01-2015 999 non-null object 18 Single Vehicle Collision 999 non-null object 19 Side Collision 999 non-null object 20 Major Damage 999 non-null object 21 Police 908 non-null object 22 SC 999 non-null object 23 Columbus 999 non-null object 24 9935 4th Drive 999 non-null object 25 5 999 non-null int64 26 1 999 non-null int64 27 YES 999 non-null object 28 1.1 999 non-null int64 29 2 999 non-null int64 30 YES.1 999 non-null object 31 71610 999 non-null int64 32 6510 999 non-null int64 33 13020 999 non-null int64 34 52080 999 non-null int64 35 Saab 999 non-null object 36 92x 999 non-null object 37 2004 999 non-null int64 38 Y 999 non-null object dtypes: float64(1), int64(17), object(21) memory usage: 304.5+ KB
columns = ins_data.columns
print(columns)
Index(['328', '48', '521585', '17-10-2014', 'OH', '250/500', '1000', '1406.91',
'0', '466132', 'MALE', 'MD', 'craft-repair', 'sleeping', 'husband',
'53300', '0.1', '25-01-2015', 'Single Vehicle Collision',
'Side Collision', 'Major Damage', 'Police', 'SC', 'Columbus',
'9935 4th Drive', '5', '1', 'YES', '1.1', '2', 'YES.1', '71610', '6510',
'13020', '52080', 'Saab', '92x', '2004', 'Y'],
dtype='object')
# Create a dictionary mapping old column names to new descriptive names
column_mapping = {
'328': 'months_as_customer',
'48': 'age',
'521585': 'policy_number',
'17-10-2014': 'policy_bind_date',
'OH': 'policy_state',
'250/500': 'policy_csl',
'1000': 'policy_deductible',
'1406.91': 'policy_annual_premium',
'0': 'umbrella_limit',
'466132': 'insured_zip',
'MALE': 'insured_sex',
'MD': 'insured_education_level',
'craft-repair': 'insured_occupation',
'sleeping': 'insured_hobbies',
'husband': 'insured_relationship',
'53300': 'capital-gains',
'0.1': 'capital-loss',
'25-01-2015': 'incident_date',
'Single Vehicle Collision': 'incident_type',
'Side Collision': 'collision_type',
'Major Damage': 'incident_severity',
'Police': 'authorities_contacted',
'SC': 'incident_state',
'Columbus': 'incident_city',
'9935 4th Drive': 'incident_location',
'5': 'incident_hour_of_the_day',
'1': 'number_of_vehicles_involved',
'YES': 'property_damage',
'1.1': 'bodily_injuries',
'2': 'witnesses',
'YES.1': 'police_report_available',
'71610': 'total_claim_amount',
'6510': 'injury_claim',
'13020': 'property_claim',
'52080': 'vehicle_claim',
'Saab': 'auto_make',
'92x': 'auto_model',
'2004': 'auto_year',
'Y': 'fraud_reported'
}
# Step 1: Add the current column names as the first row of the DataFrame
ins_data.loc[-1] = ins_data.columns
ins_data.index = ins_data.index + 1 # Shift the index
ins_data = ins_data.sort_index() # Sort the DataFrame to put the new row at the top
# Step 2: Rename the columns using the column_mapping dictionary
ins_data.rename(columns=column_mapping, inplace=True)
# Display the updated DataFrame
print(ins_data.head())
months_as_customer age policy_number policy_bind_date policy_state \ 0 328 48 521585 17-10-2014 OH 1 228 42 342868 27-06-2006 IN 2 134 29 687698 06-09-2000 OH 3 256 41 227811 25-05-1990 IL 4 228 44 367455 06-06-2014 IL policy_csl policy_deductible policy_annual_premium umbrella_limit \ 0 250/500 1000 1406.91 0 1 250/500 2000 1197.22 5000000 2 100/300 2000 1413.14 5000000 3 250/500 2000 1415.74 6000000 4 500/1000 1000 1583.91 6000000 insured_zip ... witnesses police_report_available total_claim_amount \ 0 466132 ... 2 YES.1 71610 1 468176 ... 0 ? 5070 2 430632 ... 3 NO 34650 3 608117 ... 2 NO 63400 4 610706 ... 1 NO 6500 injury_claim property_claim vehicle_claim auto_make auto_model auto_year \ 0 6510 13020 52080 Saab 92x 2004 1 780 780 3510 Mercedes E400 2007 2 7700 3850 23100 Dodge RAM 2007 3 6340 6340 50720 Chevrolet Tahoe 2014 4 1300 650 4550 Accura RSX 2009 fraud_reported 0 Y 1 Y 2 N 3 Y 4 N [5 rows x 39 columns]
# ins_data.loc[-1] = ins_data.columns
# ins_data.index = ins_data.index + 1 # shifting index
# ins_data = ins_data.sort_index()
# # Step 2: Replace the column names with new names
# new_column_names = [f'Column_{i}' for i in range(1, len(ins_data.columns) + 1)]
# ins_data.columns = new_column_names
# print(ins_data)
Column_1 Column_2 Column_3 Column_4 Column_5 Column_6 Column_7 \
0 328 48 521585 17-10-2014 OH 250/500 1000
1 228 42 342868 27-06-2006 IN 250/500 2000
2 134 29 687698 06-09-2000 OH 100/300 2000
3 256 41 227811 25-05-1990 IL 250/500 2000
4 228 44 367455 06-06-2014 IL 500/1000 1000
.. ... ... ... ... ... ... ...
995 3 38 941851 16-07-1991 OH 500/1000 1000
996 285 41 186934 05-01-2014 IL 100/300 1000
997 130 34 918516 17-02-2003 OH 250/500 500
998 458 62 533940 18-11-2011 IL 500/1000 2000
999 456 60 556080 11-11-1996 OH 250/500 1000
Column_8 Column_9 Column_10 ... Column_30 Column_31 Column_32 Column_33 \
0 1406.91 0 466132 ... 2 YES.1 71610 6510
1 1197.22 5000000 468176 ... 0 ? 5070 780
2 1413.14 5000000 430632 ... 3 NO 34650 7700
3 1415.74 6000000 608117 ... 2 NO 63400 6340
4 1583.91 6000000 610706 ... 1 NO 6500 1300
.. ... ... ... ... ... ... ... ...
995 1310.8 0 431289 ... 1 ? 87200 17440
996 1436.79 0 608177 ... 3 ? 108480 18080
997 1383.49 3000000 442797 ... 3 YES 67500 7500
998 1356.92 5000000 441714 ... 1 YES 46980 5220
999 766.19 0 612260 ... 3 ? 5060 460
Column_34 Column_35 Column_36 Column_37 Column_38 Column_39
0 13020 52080 Saab 92x 2004 Y
1 780 3510 Mercedes E400 2007 Y
2 3850 23100 Dodge RAM 2007 N
3 6340 50720 Chevrolet Tahoe 2014 Y
4 650 4550 Accura RSX 2009 N
.. ... ... ... ... ... ...
995 8720 61040 Honda Accord 2006 N
996 18080 72320 Volkswagen Passat 2015 N
997 7500 52500 Suburu Impreza 1996 N
998 5220 36540 Audi A5 1998 N
999 920 3680 Mercedes E400 2007 N
[1000 rows x 39 columns]
ins_data.columns
Index(['months_as_customer', 'age', 'policy_number', 'policy_bind_date',
'policy_state', 'policy_csl', 'policy_deductible',
'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'insured_sex',
'insured_education_level', 'insured_occupation', 'insured_hobbies',
'insured_relationship', 'capital-gains', 'capital-loss',
'incident_date', 'incident_type', 'collision_type', 'incident_severity',
'authorities_contacted', 'incident_state', 'incident_city',
'incident_location', 'incident_hour_of_the_day',
'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
'witnesses', 'police_report_available', 'total_claim_amount',
'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
'auto_model', 'auto_year', 'fraud_reported'],
dtype='object')
ins_data.info()
<class 'pandas.core.frame.DataFrame'> Index: 1000 entries, 0 to 999 Data columns (total 39 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 months_as_customer 1000 non-null object 1 age 1000 non-null object 2 policy_number 1000 non-null object 3 policy_bind_date 1000 non-null object 4 policy_state 1000 non-null object 5 policy_csl 1000 non-null object 6 policy_deductible 1000 non-null object 7 policy_annual_premium 1000 non-null object 8 umbrella_limit 1000 non-null object 9 insured_zip 1000 non-null object 10 insured_sex 1000 non-null object 11 insured_education_level 1000 non-null object 12 insured_occupation 1000 non-null object 13 insured_hobbies 1000 non-null object 14 insured_relationship 1000 non-null object 15 capital-gains 1000 non-null object 16 capital-loss 1000 non-null object 17 incident_date 1000 non-null object 18 incident_type 1000 non-null object 19 collision_type 1000 non-null object 20 incident_severity 1000 non-null object 21 authorities_contacted 909 non-null object 22 incident_state 1000 non-null object 23 incident_city 1000 non-null object 24 incident_location 1000 non-null object 25 incident_hour_of_the_day 1000 non-null object 26 number_of_vehicles_involved 1000 non-null object 27 property_damage 1000 non-null object 28 bodily_injuries 1000 non-null object 29 witnesses 1000 non-null object 30 police_report_available 1000 non-null object 31 total_claim_amount 1000 non-null object 32 injury_claim 1000 non-null object 33 property_claim 1000 non-null object 34 vehicle_claim 1000 non-null object 35 auto_make 1000 non-null object 36 auto_model 1000 non-null object 37 auto_year 1000 non-null object 38 fraud_reported 1000 non-null object dtypes: object(39) memory usage: 312.5+ KB
ins_data.columns.to_list()
['months_as_customer', 'age', 'policy_number', 'policy_bind_date', 'policy_state', 'policy_csl', 'policy_deductible', 'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'insured_sex', 'insured_education_level', 'insured_occupation', 'insured_hobbies', 'insured_relationship', 'capital-gains', 'capital-loss', 'incident_date', 'incident_type', 'collision_type', 'incident_severity', 'authorities_contacted', 'incident_state', 'incident_city', 'incident_location', 'incident_hour_of_the_day', 'number_of_vehicles_involved', 'property_damage', 'bodily_injuries', 'witnesses', 'police_report_available', 'total_claim_amount', 'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make', 'auto_model', 'auto_year', 'fraud_reported']
ins_data.isnull().sum()
months_as_customer 0 age 0 policy_number 0 policy_bind_date 0 policy_state 0 policy_csl 0 policy_deductible 0 policy_annual_premium 0 umbrella_limit 0 insured_zip 0 insured_sex 0 insured_education_level 0 insured_occupation 0 insured_hobbies 0 insured_relationship 0 capital-gains 0 capital-loss 0 incident_date 0 incident_type 0 collision_type 0 incident_severity 0 authorities_contacted 91 incident_state 0 incident_city 0 incident_location 0 incident_hour_of_the_day 0 number_of_vehicles_involved 0 property_damage 0 bodily_injuries 0 witnesses 0 police_report_available 0 total_claim_amount 0 injury_claim 0 property_claim 0 vehicle_claim 0 auto_make 0 auto_model 0 auto_year 0 fraud_reported 0 dtype: int64
ins_data.fillna(ins_data['authorities_contacted'].mode()[0] , inplace = True)
ins_data.isnull().sum()
months_as_customer 0 age 0 policy_number 0 policy_bind_date 0 policy_state 0 policy_csl 0 policy_deductible 0 policy_annual_premium 0 umbrella_limit 0 insured_zip 0 insured_sex 0 insured_education_level 0 insured_occupation 0 insured_hobbies 0 insured_relationship 0 capital-gains 0 capital-loss 0 incident_date 0 incident_type 0 collision_type 0 incident_severity 0 authorities_contacted 0 incident_state 0 incident_city 0 incident_location 0 incident_hour_of_the_day 0 number_of_vehicles_involved 0 property_damage 0 bodily_injuries 0 witnesses 0 police_report_available 0 total_claim_amount 0 injury_claim 0 property_claim 0 vehicle_claim 0 auto_make 0 auto_model 0 auto_year 0 fraud_reported 0 dtype: int64
Here , We can see that there is no null values present in our Dataset.
# Dropping those columns , which is not in use.
ins_data.drop(columns = ['policy_number' , 'incident_location'] , axis = 1 , inplace = True)
ins_data.shape
(1000, 37)
for i in ins_data.columns :
print(ins_data[i].value_counts())
print("\n")
months_as_customer
194 8
128 7
210 7
140 7
285 7
..
467 1
414 1
352 1
347 1
17 1
Name: count, Length: 392, dtype: int64
age
43 49
39 48
41 45
34 44
30 42
38 42
31 42
37 41
33 39
40 38
32 38
29 35
46 33
36 32
42 32
44 32
35 32
28 30
45 26
26 26
27 24
48 24
47 24
57 16
55 14
25 14
49 14
50 13
53 13
61 10
54 10
24 10
60 9
51 9
56 8
58 8
23 7
21 6
59 5
62 4
52 4
64 2
63 2
20 1
48 1
19 1
22 1
Name: count, dtype: int64
policy_bind_date
01-01-2006 3
28-04-1992 3
05-08-1992 3
14-12-1991 2
09-08-2004 2
..
03-06-2014 1
12-12-1998 1
18-02-1999 1
30-10-1997 1
11-11-1996 1
Name: count, Length: 951, dtype: int64
policy_state
OH 352
IL 338
IN 310
Name: count, dtype: int64
policy_csl
250/500 351
100/300 349
500/1000 300
Name: count, dtype: int64
policy_deductible
1000 350
500 342
2000 307
1000 1
Name: count, dtype: int64
policy_annual_premium
1558.29 2
1215.36 2
1362.87 2
1073.83 2
1389.13 2
..
1085.03 1
1437.33 1
988.29 1
1238.89 1
766.19 1
Name: count, Length: 991, dtype: int64
umbrella_limit
0 797
6000000 57
5000000 46
4000000 39
7000000 29
3000000 12
8000000 8
9000000 5
2000000 3
10000000 2
0 1
-1000000 1
Name: count, dtype: int64
insured_zip
477695 2
469429 2
446895 2
431202 2
456602 2
..
476303 1
450339 1
476502 1
600561 1
612260 1
Name: count, Length: 995, dtype: int64
insured_sex
FEMALE 537
MALE 463
Name: count, dtype: int64
insured_education_level
JD 161
High School 160
Associate 145
MD 144
Masters 143
PhD 125
College 122
Name: count, dtype: int64
insured_occupation
machine-op-inspct 93
prof-specialty 85
tech-support 78
sales 76
exec-managerial 76
craft-repair 74
transport-moving 72
other-service 71
priv-house-serv 71
armed-forces 69
adm-clerical 65
protective-serv 63
handlers-cleaners 54
farming-fishing 53
Name: count, dtype: int64
insured_hobbies
reading 64
exercise 57
paintball 57
bungie-jumping 56
movies 55
golf 55
camping 55
kayaking 54
yachting 53
hiking 52
video-games 50
skydiving 49
base-jumping 49
board-games 48
polo 47
chess 46
dancing 43
sleeping 41
cross-fit 35
basketball 34
Name: count, dtype: int64
insured_relationship
own-child 183
other-relative 177
not-in-family 174
husband 170
wife 155
unmarried 141
Name: count, dtype: int64
capital-gains
0 508
46300 5
51500 4
68500 4
47600 3
...
36700 1
54900 1
69200 1
48800 1
50300 1
Name: count, Length: 339, dtype: int64
capital-loss
0 474
-53700 5
-31700 5
-50300 5
-45300 4
...
-64100 1
-50400 1
-29900 1
-91400 1
-82100 1
Name: count, Length: 355, dtype: int64
incident_date
02-02-2015 28
17-02-2015 26
07-01-2015 25
10-01-2015 24
04-02-2015 24
24-01-2015 24
19-01-2015 23
08-01-2015 22
13-01-2015 21
30-01-2015 21
12-02-2015 20
22-02-2015 20
31-01-2015 20
06-02-2015 20
21-02-2015 19
01-01-2015 19
23-02-2015 19
12-01-2015 19
14-01-2015 19
21-01-2015 19
03-01-2015 18
14-02-2015 18
01-02-2015 18
28-02-2015 18
20-01-2015 18
18-01-2015 18
25-02-2015 18
06-01-2015 17
09-01-2015 17
08-02-2015 17
24-02-2015 17
26-02-2015 17
13-02-2015 16
15-02-2015 16
16-02-2015 16
05-02-2015 16
16-01-2015 16
17-01-2015 15
18-02-2015 15
28-01-2015 15
15-01-2015 15
22-01-2015 14
20-02-2015 14
27-02-2015 14
23-01-2015 13
03-02-2015 13
27-01-2015 13
09-02-2015 13
04-01-2015 12
01-03-2015 12
26-01-2015 11
29-01-2015 11
02-01-2015 11
19-02-2015 10
11-02-2015 10
10-02-2015 10
07-02-2015 10
25-01-2015 10
11-01-2015 9
05-01-2015 7
Name: count, dtype: int64
incident_type
Multi-vehicle Collision 419
Single Vehicle Collision 403
Vehicle Theft 94
Parked Car 84
Name: count, dtype: int64
collision_type
Rear Collision 292
Side Collision 276
Front Collision 254
? 178
Name: count, dtype: int64
incident_severity
Minor Damage 354
Total Loss 280
Major Damage 276
Trivial Damage 90
Name: count, dtype: int64
authorities_contacted
Police 383
Fire 223
Other 198
Ambulance 196
Name: count, dtype: int64
incident_state
NY 262
SC 248
WV 217
VA 110
NC 110
PA 30
OH 23
Name: count, dtype: int64
incident_city
Springfield 157
Arlington 152
Columbus 149
Northbend 145
Hillsdale 141
Riverwood 134
Northbrook 122
Name: count, dtype: int64
incident_hour_of_the_day
17 54
3 53
0 52
23 51
16 49
10 46
4 46
13 46
6 44
14 43
9 43
21 42
18 41
12 40
19 40
7 40
15 39
22 38
8 36
20 34
5 32
2 31
11 30
1 29
5 1
Name: count, dtype: int64
number_of_vehicles_involved
1 580
3 358
4 31
2 30
1 1
Name: count, dtype: int64
property_damage
? 360
NO 338
YES 302
Name: count, dtype: int64
bodily_injuries
0 340
2 332
1 327
1.1 1
Name: count, dtype: int64
witnesses
1 258
0 249
2 249
3 243
2 1
Name: count, dtype: int64
police_report_available
? 343
NO 343
YES 313
YES.1 1
Name: count, dtype: int64
total_claim_amount
59400 5
2640 4
44200 4
5940 4
4320 4
..
87100 1
6240 1
66600 1
70920 1
67500 1
Name: count, Length: 764, dtype: int64
injury_claim
0 25
640 7
480 7
6340 5
580 5
..
14840 1
6580 1
11820 1
16650 1
7500 1
Name: count, Length: 639, dtype: int64
property_claim
0 19
860 6
480 5
660 5
10000 5
..
3590 1
6480 1
4580 1
4920 1
7500 1
Name: count, Length: 626, dtype: int64
vehicle_claim
5040 7
3360 6
44800 5
3600 5
33600 5
..
43360 1
25130 1
38940 1
47430 1
52500 1
Name: count, Length: 727, dtype: int64
auto_make
Saab 80
Dodge 80
Suburu 80
Nissan 78
Chevrolet 76
Ford 72
BMW 72
Toyota 70
Audi 69
Accura 68
Volkswagen 68
Jeep 67
Mercedes 65
Honda 55
Name: count, dtype: int64
auto_model
RAM 43
Wrangler 42
A3 37
Neon 37
MDX 36
Jetta 35
Passat 33
A5 32
Legacy 32
Pathfinder 31
Malibu 30
92x 28
Camry 28
Forrestor 28
F150 27
95 27
E400 27
93 25
Grand Cherokee 25
Escape 24
Tahoe 24
Maxima 24
Ultima 23
X5 23
Highlander 22
Civic 22
Silverado 22
Fusion 21
ML350 20
Impreza 20
Corolla 20
TL 20
CRV 20
C300 18
3 Series 18
X6 16
M5 15
Accord 13
RSX 12
Name: count, dtype: int64
auto_year
1995 56
1999 55
2005 54
2006 53
2011 53
2007 52
2003 51
2010 50
2009 50
2013 49
2002 49
2015 47
2012 46
1997 46
2008 45
2014 44
2000 42
2001 42
1998 40
2004 38
1996 37
2004 1
Name: count, dtype: int64
fraud_reported
N 753
Y 247
Name: count, dtype: int64
From the values we can observe, *
ins_data.drop(['umbrella_limit' , 'insured_zip'] , axis = 1 , inplace = True)
Let us retrieve the day, month and year from the features policy_bind_day and incident_date
ins_data['policy_bind_date']=pd.to_datetime(ins_data['policy_bind_date'])
ins_data['incident_date']=pd.to_datetime(ins_data['incident_date'])
<ipython-input-17-85a852d81472>:1: UserWarning: Parsing dates in %d-%m-%Y format when dayfirst=False (the default) was specified. Pass `dayfirst=True` or specify a format to silence this warning. ins_data['policy_bind_date']=pd.to_datetime(ins_data['policy_bind_date']) <ipython-input-17-85a852d81472>:2: UserWarning: Parsing dates in %d-%m-%Y format when dayfirst=False (the default) was specified. Pass `dayfirst=True` or specify a format to silence this warning. ins_data['incident_date']=pd.to_datetime(ins_data['incident_date'])
# Extracting Day, Month and Year column from policy_bind_date
ins_data['policy_bind_day'] = ins_data['policy_bind_date'].dt.day
ins_data['policy_bind_month'] = ins_data['policy_bind_date'].dt.month
ins_data['policy_bind_year'] = ins_data['policy_bind_date'].dt.year
# Extracting Day, Month and Year column from incident_date
ins_data['incident_day'] = ins_data['incident_date'].dt.day
ins_data['incident_month'] = ins_data['incident_date'].dt.month
ins_data['incident_year'] = ins_data['incident_date'].dt.year
ins_data.drop(['policy_bind_date','incident_date'],axis=1,inplace=True)
ins_data[ins_data.columns[(ins_data=="?").any()]].nunique()
collision_type 4 property_damage 3 police_report_available 4 dtype: int64
All these columns are categorical columns, so let us fill theose values with it's mode.
ins_data['collision_type'].value_counts()
collision_type Rear Collision 292 Side Collision 276 Front Collision 254 ? 178 Name: count, dtype: int64
ins_data['collision_type'] = ins_data.collision_type.str.replace('?' , ins_data['collision_type'].mode()[0])
ins_data['property_damage'].value_counts()
property_damage ? 360 NO 338 YES 302 Name: count, dtype: int64
We will fill the "?" in the feature property_damage with the value "NO"
ins_data['property_damage'] = ins_data.property_damage.str.replace('?' , 'NO')
ins_data['police_report_available'].value_counts()
police_report_available ? 343 NO 343 YES 313 YES.1 1 Name: count, dtype: int64
Let us fill the unknown value with NO
ins_data['police_report_available'] = ins_data.police_report_available.str.replace('?' , "NO")
The features policy_csl showing object data type but it contains numerical data,let us extratc csl_per_person and csl_per_accident from policy_csl column and then will convert their object data type into integer data type.
# Extracting csl_per_person and cls_per_accident from policy_csl column.
ins_data['csl_per_person'] = ins_data.policy_csl.str.split('/' , expand=True)[0]
ins_data['csl_per_accident'] = ins_data.policy_csl.str.split('/' , expand = True)[1]
# Converting object datatype into integer data type
ins_data['csl_per_person']=ins_data['csl_per_person'].astype('int64')
ins_data['csl_per_accident']=ins_data['csl_per_accident'].astype('int64')
# since we ectracted the data from policy_csl column , So let's rop it.
ins_data.drop('policy_csl' , axis = 1 , inplace = True)
ins_data['auto_year'] = ins_data['auto_year'].astype('int64')
# Lets extract age of the vehicle from auto_year by subtracting it from the year 2018
ins_data['Vehicle_Age']=2021-ins_data['auto_year']
ins_data.drop("auto_year",axis=1,inplace=True)
# checking for unique value present in dataset
ins_data.nunique().to_frame("Number of unique values")
| Number of unique values | |
|---|---|
| months_as_customer | 392 |
| age | 47 |
| policy_state | 3 |
| policy_deductible | 4 |
| policy_annual_premium | 991 |
| insured_sex | 2 |
| insured_education_level | 7 |
| insured_occupation | 14 |
| insured_hobbies | 20 |
| insured_relationship | 6 |
| capital-gains | 339 |
| capital-loss | 355 |
| incident_type | 4 |
| collision_type | 3 |
| incident_severity | 4 |
| authorities_contacted | 4 |
| incident_state | 7 |
| incident_city | 7 |
| incident_hour_of_the_day | 25 |
| number_of_vehicles_involved | 5 |
| property_damage | 2 |
| bodily_injuries | 4 |
| witnesses | 5 |
| police_report_available | 3 |
| total_claim_amount | 764 |
| injury_claim | 639 |
| property_claim | 626 |
| vehicle_claim | 727 |
| auto_make | 14 |
| auto_model | 39 |
| fraud_reported | 2 |
| policy_bind_day | 31 |
| policy_bind_month | 12 |
| policy_bind_year | 26 |
| incident_day | 31 |
| incident_month | 3 |
| incident_year | 1 |
| csl_per_person | 3 |
| csl_per_accident | 3 |
| Vehicle_Age | 21 |
The column incident_year has only 1 value , So we can drop that column.
# Dropping incident_year column
ins_data.drop('incident_year' , axis = 1 , inplace = True)
ins_data.head()
| months_as_customer | age | policy_state | policy_deductible | policy_annual_premium | insured_sex | insured_education_level | insured_occupation | insured_hobbies | insured_relationship | ... | auto_model | fraud_reported | policy_bind_day | policy_bind_month | policy_bind_year | incident_day | incident_month | csl_per_person | csl_per_accident | Vehicle_Age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 328 | 48 | OH | 1000 | 1406.91 | MALE | MD | craft-repair | sleeping | husband | ... | 92x | Y | 17 | 10 | 2014 | 25 | 1 | 250 | 500 | 17 |
| 1 | 228 | 42 | IN | 2000 | 1197.22 | MALE | MD | machine-op-inspct | reading | other-relative | ... | E400 | Y | 27 | 6 | 2006 | 21 | 1 | 250 | 500 | 14 |
| 2 | 134 | 29 | OH | 2000 | 1413.14 | FEMALE | PhD | sales | board-games | own-child | ... | RAM | N | 6 | 9 | 2000 | 22 | 2 | 100 | 300 | 14 |
| 3 | 256 | 41 | IL | 2000 | 1415.74 | FEMALE | PhD | armed-forces | board-games | unmarried | ... | Tahoe | Y | 25 | 5 | 1990 | 10 | 1 | 250 | 500 | 7 |
| 4 | 228 | 44 | IL | 1000 | 1583.91 | MALE | Associate | sales | board-games | unmarried | ... | RSX | N | 6 | 6 | 2014 | 17 | 2 | 500 | 1000 | 12 |
5 rows × 39 columns
ins_data.shape
(1000, 39)
ins_data.info()
<class 'pandas.core.frame.DataFrame'> Index: 1000 entries, 0 to 999 Data columns (total 39 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 months_as_customer 1000 non-null object 1 age 1000 non-null object 2 policy_state 1000 non-null object 3 policy_deductible 1000 non-null object 4 policy_annual_premium 1000 non-null object 5 insured_sex 1000 non-null object 6 insured_education_level 1000 non-null object 7 insured_occupation 1000 non-null object 8 insured_hobbies 1000 non-null object 9 insured_relationship 1000 non-null object 10 capital-gains 1000 non-null object 11 capital-loss 1000 non-null object 12 incident_type 1000 non-null object 13 collision_type 1000 non-null object 14 incident_severity 1000 non-null object 15 authorities_contacted 1000 non-null object 16 incident_state 1000 non-null object 17 incident_city 1000 non-null object 18 incident_hour_of_the_day 1000 non-null object 19 number_of_vehicles_involved 1000 non-null object 20 property_damage 1000 non-null object 21 bodily_injuries 1000 non-null object 22 witnesses 1000 non-null object 23 police_report_available 1000 non-null object 24 total_claim_amount 1000 non-null object 25 injury_claim 1000 non-null object 26 property_claim 1000 non-null object 27 vehicle_claim 1000 non-null object 28 auto_make 1000 non-null object 29 auto_model 1000 non-null object 30 fraud_reported 1000 non-null object 31 policy_bind_day 1000 non-null int32 32 policy_bind_month 1000 non-null int32 33 policy_bind_year 1000 non-null int32 34 incident_day 1000 non-null int32 35 incident_month 1000 non-null int32 36 csl_per_person 1000 non-null int64 37 csl_per_accident 1000 non-null int64 38 Vehicle_Age 1000 non-null int64 dtypes: int32(5), int64(3), object(31) memory usage: 293.0+ KB
Seperating the categorical and numerical column.
ins_data['months_as_customer'] = ins_data['months_as_customer'].astype('int64')
ins_data['age'] = ins_data['age'].astype("int64")
ins_data['policy_deductible'] = ins_data['policy_deductible'].astype("int64")
ins_data['capital-gains'] = ins_data['capital-gains'].astype('int64')
ins_data['number_of_vehicles_involved'] = ins_data['number_of_vehicles_involved'].astype('int64')
ins_data['incident_hour_of_the_day'] = ins_data['incident_hour_of_the_day'].astype("int64")
ins_data['witnesses'] = ins_data['witnesses'].astype("int64")
ins_data['injury_claim'] = ins_data['injury_claim'].astype('int64')
ins_data['property_claim'] = ins_data['property_claim'].astype('int64')
ins_data['vehicle_claim'] = ins_data['vehicle_claim'].astype('int64')
ins_data['policy_bind_day'] = ins_data['policy_bind_day'].astype('int64')
ins_data['policy_bind_month'] = ins_data['policy_bind_month'].astype('int64')
ins_data['policy_bind_year'] = ins_data['policy_bind_year'].astype('int64')
ins_data['incident_month'] = ins_data['incident_month'].astype('int64')
ins_data['Vehicle_Age'] = ins_data['Vehicle_Age'].astype('int64')
# Convert the 'bodily_injuries' column to integers, ignoring any non-integer values
ins_data['total_claim_amount'] = ins_data['total_claim_amount'].astype('int64')
ins_data['bodily_injuries'] = ins_data['bodily_injuries'].astype('int64', errors='ignore') # Convert to int, ignoring errors
ins_data['capital-loss'] = ins_data['capital-loss'].astype("int64" , errors='ignore')
ins_data.info('policy_annual_premium')
<class 'pandas.core.frame.DataFrame'> Index: 1000 entries, 0 to 999 Data columns (total 39 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 months_as_customer 1000 non-null int64 1 age 1000 non-null int64 2 policy_state 1000 non-null object 3 policy_deductible 1000 non-null int64 4 policy_annual_premium 1000 non-null object 5 insured_sex 1000 non-null object 6 insured_education_level 1000 non-null object 7 insured_occupation 1000 non-null object 8 insured_hobbies 1000 non-null object 9 insured_relationship 1000 non-null object 10 capital-gains 1000 non-null int64 11 capital-loss 1000 non-null object 12 incident_type 1000 non-null object 13 collision_type 1000 non-null object 14 incident_severity 1000 non-null object 15 authorities_contacted 1000 non-null object 16 incident_state 1000 non-null object 17 incident_city 1000 non-null object 18 incident_hour_of_the_day 1000 non-null int64 19 number_of_vehicles_involved 1000 non-null int64 20 property_damage 1000 non-null object 21 bodily_injuries 1000 non-null object 22 witnesses 1000 non-null int64 23 police_report_available 1000 non-null object 24 total_claim_amount 1000 non-null int64 25 injury_claim 1000 non-null int64 26 property_claim 1000 non-null int64 27 vehicle_claim 1000 non-null int64 28 auto_make 1000 non-null object 29 auto_model 1000 non-null object 30 fraud_reported 1000 non-null object 31 policy_bind_day 1000 non-null int64 32 policy_bind_month 1000 non-null int64 33 policy_bind_year 1000 non-null int64 34 incident_day 1000 non-null int32 35 incident_month 1000 non-null int64 36 csl_per_person 1000 non-null int64 37 csl_per_accident 1000 non-null int64 38 Vehicle_Age 1000 non-null int64 dtypes: int32(1), int64(18), object(20) memory usage: 308.6+ KB
ins_data['policy_annual_premium'] = ins_data['policy_annual_premium'].astype("int64" , errors = 'ignore')
# Checking for categorical columns :
cat_col = []
for col in ins_data.dtypes.index :
if ins_data.dtypes[col] == 'object' :
cat_col.append(col)
print("Categorical columns are : \n" , cat_col)
print("\n")
# Checking for numerical columns
numerical_col = []
for i in ins_data.dtypes.index:
if ins_data.dtypes[i]!='object':
numerical_col.append(i)
print("Numerical columns are:\n",numerical_col)
print("\n")
Categorical columns are : ['policy_state', 'policy_annual_premium', 'insured_sex', 'insured_education_level', 'insured_occupation', 'insured_hobbies', 'insured_relationship', 'capital-loss', 'incident_type', 'collision_type', 'incident_severity', 'authorities_contacted', 'incident_state', 'incident_city', 'property_damage', 'bodily_injuries', 'police_report_available', 'auto_make', 'auto_model', 'fraud_reported'] Numerical columns are: ['months_as_customer', 'age', 'policy_deductible', 'capital-gains', 'incident_hour_of_the_day', 'number_of_vehicles_involved', 'witnesses', 'total_claim_amount', 'injury_claim', 'property_claim', 'vehicle_claim', 'policy_bind_day', 'policy_bind_month', 'policy_bind_year', 'incident_day', 'incident_month', 'csl_per_person', 'csl_per_accident', 'Vehicle_Age']
# checking for unique values in target columns :
ins_data['fraud_reported'].unique()
array(['Y', 'N'], dtype=object)
# Checking the list counts of target
ins_data['fraud_reported'].value_counts()
fraud_reported N 753 Y 247 Name: count, dtype: int64
As we can see that our dataset is imbalanced , so we need to balance it. We will use oversampling and balance the data later.
ins_data.describe()
| months_as_customer | age | policy_deductible | capital-gains | incident_hour_of_the_day | number_of_vehicles_involved | witnesses | total_claim_amount | injury_claim | property_claim | vehicle_claim | policy_bind_day | policy_bind_month | policy_bind_year | incident_day | incident_month | csl_per_person | csl_per_accident | Vehicle_Age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.00000 | 1000.000000 | 1000.00000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
| mean | 203.954000 | 38.948000 | 1136.000000 | 25126.100000 | 11.644000 | 1.83900 | 1.487000 | 52761.94000 | 7433.420000 | 7399.570000 | 37928.950000 | 15.448000 | 6.559000 | 2001.604000 | 14.995000 | 1.496000 | 272.650000 | 580.200000 | 15.897000 |
| std | 115.113174 | 9.140287 | 611.864673 | 27872.187708 | 6.951373 | 1.01888 | 1.111335 | 26401.53319 | 4880.951853 | 4824.726179 | 18886.252893 | 8.808001 | 3.499824 | 7.360391 | 8.670995 | 0.523697 | 161.603196 | 287.420547 | 6.015861 |
| min | 0.000000 | 19.000000 | 500.000000 | 0.000000 | 0.000000 | 1.00000 | 0.000000 | 100.00000 | 0.000000 | 0.000000 | 70.000000 | 1.000000 | 1.000000 | 1990.000000 | 1.000000 | 1.000000 | 100.000000 | 300.000000 | 6.000000 |
| 25% | 115.750000 | 32.000000 | 500.000000 | 0.000000 | 6.000000 | 1.00000 | 1.000000 | 41812.50000 | 4295.000000 | 4445.000000 | 30292.500000 | 8.000000 | 3.000000 | 1995.000000 | 7.750000 | 1.000000 | 100.000000 | 300.000000 | 11.000000 |
| 50% | 199.500000 | 38.000000 | 1000.000000 | 0.000000 | 12.000000 | 1.00000 | 1.000000 | 58055.00000 | 6775.000000 | 6750.000000 | 42100.000000 | 16.000000 | 7.000000 | 2002.000000 | 15.000000 | 1.000000 | 250.000000 | 500.000000 | 16.000000 |
| 75% | 276.250000 | 44.000000 | 2000.000000 | 51025.000000 | 17.000000 | 3.00000 | 2.000000 | 70592.50000 | 11305.000000 | 10885.000000 | 50822.500000 | 23.000000 | 10.000000 | 2008.000000 | 22.000000 | 2.000000 | 500.000000 | 1000.000000 | 21.000000 |
| max | 479.000000 | 64.000000 | 2000.000000 | 100500.000000 | 23.000000 | 4.00000 | 3.000000 | 114920.00000 | 21450.000000 | 23670.000000 | 79560.000000 | 31.000000 | 12.000000 | 2015.000000 | 31.000000 | 3.000000 | 500.000000 | 1000.000000 | 26.000000 |
# Visualizing how many insurance claims is fraudulent.
print(ins_data['fraud_reported'].value_counts())
sns.countplot(x = ins_data['fraud_reported'] , palette = 'Set3')
plt.show()
fraud_reported N 753 Y 247 Name: count, dtype: int64
<ipython-input-47-61ed9e3952b8>:4: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(x = ins_data['fraud_reported'] , palette = 'Set3')
# Visualizing how many policy states are present in the dataset.
print(ins_data['policy_state'].value_counts())
sns.countplot(x = ins_data['policy_state'] , palette = 'PuRd')
plt.show()
policy_state OH 352 IL 338 IN 310 Name: count, dtype: int64
<ipython-input-48-110852217e78>:4: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(x = ins_data['policy_state'] , palette = 'PuRd')
# Visualizing count of customers insured education level are present in the dataset
print(ins_data['insured_education_level'].value_counts())
sns.countplot(x = ins_data['insured_education_level'],palette='PuRd')
plt.show()
insured_education_level JD 161 High School 160 Associate 145 MD 144 Masters 143 PhD 125 College 122 Name: count, dtype: int64
<ipython-input-49-2cff38011a92>:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(x = ins_data['insured_education_level'],palette='PuRd')
# Visualizing count of customers insured occupation are present in the dataset
print(ins_data['insured_occupation'].value_counts())
sns.countplot(x = ins_data['insured_occupation'],palette='RdBu')
plt.xticks(rotation=90)
plt.show()
insured_occupation machine-op-inspct 93 prof-specialty 85 tech-support 78 sales 76 exec-managerial 76 craft-repair 74 transport-moving 72 other-service 71 priv-house-serv 71 armed-forces 69 adm-clerical 65 protective-serv 63 handlers-cleaners 54 farming-fishing 53 Name: count, dtype: int64
<ipython-input-50-d2c2f57722ab>:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(x = ins_data['insured_occupation'],palette='RdBu')
# Visualizing count of insured sex are present in the dataset
print(ins_data['insured_hobbies'].value_counts())
sns.countplot(x = ins_data['insured_hobbies'],palette='PuBu')
plt.xticks(rotation =90)
plt.show()
insured_hobbies reading 64 exercise 57 paintball 57 bungie-jumping 56 movies 55 golf 55 camping 55 kayaking 54 yachting 53 hiking 52 video-games 50 skydiving 49 base-jumping 49 board-games 48 polo 47 chess 46 dancing 43 sleeping 41 cross-fit 35 basketball 34 Name: count, dtype: int64
<ipython-input-51-b8c66a2fd3d0>:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(x = ins_data['insured_hobbies'],palette='PuBu')
# Visualizing count of what kind of relationship the isured cutomers are in the dataset
print(ins_data['insured_relationship'].value_counts())
sns.countplot(x = ins_data['insured_relationship'],palette='PRGn')
plt.xticks(rotation = 45)
plt.show()
insured_relationship own-child 183 other-relative 177 not-in-family 174 husband 170 wife 155 unmarried 141 Name: count, dtype: int64
<ipython-input-52-9bfd221db3fb>:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(x = ins_data['insured_relationship'],palette='PRGn')
# Visualizing count of insured sex are present in the dataset
print(ins_data['incident_type'].value_counts())
sns.countplot(x = ins_data['incident_type'],palette='Spectral')
plt.xticks(rotation = 45)
plt.show()
incident_type Multi-vehicle Collision 419 Single Vehicle Collision 403 Vehicle Theft 94 Parked Car 84 Name: count, dtype: int64
<ipython-input-53-aed4044a4c24>:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(x = ins_data['incident_type'],palette='Spectral')
# Visualizing count of insured sex are present in the dataset
print(ins_data['collision_type'].value_counts())
sns.countplot(x = ins_data['collision_type'],palette='OrRd')
plt.show()
collision_type Rear Collision 470 Side Collision 276 Front Collision 254 Name: count, dtype: int64
<ipython-input-54-732d5b267353>:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(x = ins_data['collision_type'],palette='OrRd')
plt.figure(figsize = (25 , 35) , facecolor = 'white')
plotnumber = 1
for col in numerical_col :
if plotnumber<=23 :
ax = plt.subplot(8,3,plotnumber)
sns.distplot(ins_data[col] , hist=False)
plt.xlabel(col , fontsize = 20)
plotnumber+=1
plt.show()
<ipython-input-55-f5ea0f468172>:6: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(ins_data[col] , hist=False) <ipython-input-55-f5ea0f468172>:6: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(ins_data[col] , hist=False) <ipython-input-55-f5ea0f468172>:6: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(ins_data[col] , hist=False) <ipython-input-55-f5ea0f468172>:6: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(ins_data[col] , hist=False) <ipython-input-55-f5ea0f468172>:6: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(ins_data[col] , hist=False) <ipython-input-55-f5ea0f468172>:6: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(ins_data[col] , hist=False) <ipython-input-55-f5ea0f468172>:6: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(ins_data[col] , hist=False) <ipython-input-55-f5ea0f468172>:6: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(ins_data[col] , hist=False) <ipython-input-55-f5ea0f468172>:6: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(ins_data[col] , hist=False) <ipython-input-55-f5ea0f468172>:6: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(ins_data[col] , hist=False) <ipython-input-55-f5ea0f468172>:6: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(ins_data[col] , hist=False) <ipython-input-55-f5ea0f468172>:6: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(ins_data[col] , hist=False) <ipython-input-55-f5ea0f468172>:6: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(ins_data[col] , hist=False) <ipython-input-55-f5ea0f468172>:6: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(ins_data[col] , hist=False) <ipython-input-55-f5ea0f468172>:6: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(ins_data[col] , hist=False) <ipython-input-55-f5ea0f468172>:6: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(ins_data[col] , hist=False) <ipython-input-55-f5ea0f468172>:6: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(ins_data[col] , hist=False) <ipython-input-55-f5ea0f468172>:6: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(ins_data[col] , hist=False) <ipython-input-55-f5ea0f468172>:6: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(ins_data[col] , hist=False)
From the above distribution plots:
# Comparing poicy_state and fraud_reported
plt.figure(figsize = (10 , 8))
sns.countplot(x = ins_data['policy_state'] , data = ins_data , hue = 'fraud_reported')
plt.title("POLICY_STATES vs FRAUD_REPORTED")
plt.show()
Here we can see that Fraud report is the highest in "OH".
# Comparing insured_education_level and fraud_reported.
plt.figure(figsize = (10 , 8))
sns.countplot(x = ins_data['insured_education_level'] , data = ins_data , hue = ins_data['fraud_reported'])
plt.title("EDUCATION LEVEL vs FRAUD REPORTED")
plt.show()
# Comparing insured_occupation vs Fraud_reported.
plt.figure(figsize = (10 , 8))
sns.countplot(x = ins_data['insured_occupation'] , data = ins_data , hue = ins_data['fraud_reported'])
plt.xticks(rotation = 90)
plt.show()
The people who are in the position exec-managerial have high fraud reports compared to others.
# Comparing insured_hobbies and fraud_reported.
plt.figure(figsize = (10 , 8))
sns.countplot(x = ins_data['insured_hobbies'] , hue = ins_data['fraud_reported'] , data = ins_data)
plt.xticks(rotation = 90)
plt.show()
The fraud report is high for the people who have the hobby of playing chess and cross fit.
# Comparing insured_relationships and fraud_reported
sns.countplot(x = ins_data['insured_relationship'],data=ins_data,hue=ins_data['fraud_reported'],palette='Set2')
plt.xticks(rotation=90)
plt.show()
The fraud report is high for the customers who have other relative and it is less for unmarried people.
# Comparing incident_type and fraud_reported
sns.countplot(x = ins_data['incident_type'],data=ins_data,hue=ins_data['fraud_reported'],palette='Set2')
plt.xticks(rotation=45)
plt.show()
The fraud reported when the type of incident is Multivehicle collision and single vehicle collision is high compared to Vehicle theft and parked vehicles.
# Comparing collision_type and fraud_reported
sns.countplot(x = ins_data['collision_type'],data=ins_data,hue=ins_data['fraud_reported'],palette='Set2')
plt.show()
The fraud reported is high when the collision type is Rear Collision. When the collision type is Side and front the fraud reported is similar.
# Comparing incident_Severeity and fraud_reported
sns.countplot(x = ins_data['incident_severity'],data=ins_data, hue=ins_data['fraud_reported'] ,palette='Set2')
plt.show()
The fraud report is high when the type of damage is Major damage and fraud commited is the least when the type of damage Trivial Damage.
# Comparing authorities_contacted and fraud_reported
sns.countplot(x = ins_data['authorities_contacted'],data=ins_data,hue=ins_data['fraud_reported'],palette='Set1')
plt.show()
The police contacted cases are very high.
# Comparing incident_state and fraud_reported
sns.catplot(x=ins_data['incident_state'],data=ins_data,kind='count',col=ins_data['fraud_reported'],palette='RdBu')
plt.show()
<ipython-input-65-c0849494b9e1>:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.catplot(x=ins_data['incident_state'],data=ins_data,kind='count',col=ins_data['fraud_reported'],palette='RdBu')
Most fradulant cases have taken place in the state "SC" followed by "NY" and the least in the state "PA".
# Comparig incident_city and fraud_reported
sns.catplot(x = ins_data['incident_city'],kind='count',data=ins_data,hue=ins_data['fraud_reported'],palette="bright")
plt.xticks(rotation=90)
plt.show()
The cities Riverwood and Northbrook have very less fraud reports compared to others.
# Comparing property_damage and fraud_reported
sns.catplot(x = ins_data['property_damage'],kind='count',data=ins_data,hue=ins_data['fraud_reported'],palette="crest")
plt.show()
The number of fraudulant cases reported when property damage is not there is higher.
# Comparing police_report_available and fraud_reported
sns.catplot(x = ins_data['police_report_available'],kind='count',data=ins_data,hue=ins_data['fraud_reported'],palette="bright")
plt.show()
The number of fraudulant cases are more when there are no police reports available.
fig,axes=plt.subplots(2,2,figsize=(12,10))
# Comparing insured_sex and age
sns.violinplot(x=ins_data['insured_sex'],y=ins_data['age'],ax=axes[0,0],data=ins_data,hue=ins_data["fraud_reported"],split=True)
# Comparing policy_state and witnesses
sns.violinplot(x=ins_data['policy_state'],y=ins_data['witnesses'],ax=axes[0,1],data=ins_data,hue=ins_data["fraud_reported"],split=True)
# Comparing csl_per_accident and property_claim
sns.violinplot(x=ins_data['csl_per_accident'],y=ins_data['property_claim'],ax=axes[1,0],data=ins_data,hue=ins_data["fraud_reported"],split=True)
# Comparing csl_per_person and age
sns.violinplot(x=ins_data['csl_per_person'],y=ins_data['age'],ax=axes[1,1],data=ins_data,hue=ins_data["fraud_reported"],split=True)
plt.show()
The fraud report is high for both the males-females having age between 30-45.\ The people who own the policy state "IN" have high fraud report.\ The person who has csl per accidemt insurance by claimimg property in the range 5000-15000 have the fraud report.\ The csl_per_person with age 30-45 are facing the fraudulent reports.
sns.pairplot(ins_data,hue="fraud_reported")
plt.show()